This code generates music using two RNNs: (1) a Generalist, which does not take the styles of the different composers into account, and (2) a Specialist, which tries to learn the style of each composer.
The project consists of three main parts:
class Generalist(nn.Module):
    """The generalist composes music without learning the difference
    between composers, using a plain LSTM.

    Input and output size are equal: the model predicts the next timestep
    of the same piano-roll representation it consumes.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super(Generalist, self).__init__()
        self.input_size = input_size
        self.output_size = input_size  # predict the same feature space as the input
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.hidden = None  # most recent (h, c) pair, updated by forward()
        # Inter-layer dropout is only meaningful with >1 LSTM layer
        # (PyTorch warns otherwise), hence the conditional.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            dropout=(0 if num_layers == 1 else 0.5))
        self.hidden_to_output = nn.Linear(hidden_size, self.output_size)

    def init_hidden(self, tag):
        """Return a zeroed (h, c) state for batch size 1.

        ``tag`` is ignored; it exists only so Generalist and Specialist
        expose the same interface.
        """
        # torch.autograd.Variable is deprecated since PyTorch 0.4; plain
        # tensors carry autograd information themselves, so the wrapper is
        # removed with no behavior change.
        return (torch.zeros(self.num_layers, 1, self.hidden_size),
                torch.zeros(self.num_layers, 1, self.hidden_size))

    def forward(self, inputs, tag=None, hidden=None):
        # Generalist: never condition on the composer tag. When no hidden
        # state is given, reuse the one stored from the previous call.
        hidden = self.hidden if hidden is None else hidden
        output, self.hidden = self.lstm(inputs, hidden)
        output = self.hidden_to_output(output)
        return output, self.hidden
class Specialist(nn.Module):
    """The specialist composes music conditioned on the composer, using an
    LSTM whose initial hidden state is a learned per-composer embedding.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_composers=4):
        super(Specialist, self).__init__()
        self.input_size = input_size
        self.output_size = input_size  # predict the same feature space as the input
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # One embedding vector per composer, sized to match the LSTM hidden state.
        self.embedding_size = hidden_size
        self.num_embeddings = num_composers
        self.hidden = None  # most recent (h, c) pair, updated by forward()
        self.tag_to_hidden = nn.Embedding(num_embeddings=num_composers,
                                          embedding_dim=self.embedding_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.hidden_to_output = nn.Linear(hidden_size, self.output_size)

    def init_hidden(self, tag):
        """Map ``tag`` (a composer-index tensor) to a starting (h, c) state.

        The learned embedding seeds the hidden state so the model can
        differentiate composers; the cell state starts at zero.
        NOTE(review): the embedding's shape follows ``tag``'s shape, while
        nn.LSTM expects hidden state (num_layers, 1, hidden_size) -- confirm
        callers pass a tag shaped so the two line up.
        """
        hidden_start = self.tag_to_hidden(tag)
        # torch.autograd.Variable is deprecated since PyTorch 0.4; plain
        # tensors carry autograd information themselves, so the wrapper is
        # removed with no behavior change.
        cell_start = torch.zeros(self.num_layers, 1, self.hidden_size)
        return (hidden_start, cell_start)

    def forward(self, inputs, tag=None, hidden=None):
        # The composer tag enters only through init_hidden(); forward itself
        # ignores ``tag`` and reuses the stored state when none is given.
        hidden = self.hidden if hidden is None else hidden
        output, self.hidden = self.lstm(inputs, hidden)
        output = self.hidden_to_output(output)
        return output, self.hidden
def gen_batch(inputs, targets, batch_size=32, overlap=8):
    """Yield successive ``(input, target)`` windows of up to ``batch_size``
    timesteps, where consecutive windows share ``overlap`` timesteps.

    The final window may be shorter than ``batch_size`` when the sequence
    length is not a multiple of the stride.
    """
    assert len(inputs) == len(targets), "Inputs and targets must have same length"
    assert batch_size > overlap, "overlap must be smaller than batch_size"
    length = len(inputs)
    stride = batch_size - overlap
    for idx in range(0, length, stride):
        # BUG FIX: the window previously ended at idx + batch_size - overlap,
        # i.e. exactly one stride later, so consecutive windows were disjoint
        # and the requested overlap never actually happened.
        last_idx = min(idx + batch_size, length)
        yield inputs[idx:last_idx], targets[idx:last_idx]
import time
# Piano-roll feature size: one slot per MIDI pitch per timestep.
INPUT_SIZE = 128
HIDDEN_SIZE = 256
NUM_HIDDEN_LAYERS = 1
# Loss functions / optimizers tried during experimentation, kept for reference:
#loss_func = nn.MSELoss()
#Tsloss_func = nn.L1Loss()
#loss_func = nn.BCELoss() NOTE! Requires range of outputs to be between 0 and 1
#optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)
#optimizer = optim.SparseAdam(model.parameters(), lr=1e-3)
# BCEWithLogitsLoss folds the sigmoid into the loss, so the models can emit raw logits.
loss_func = nn.BCEWithLogitsLoss()
# Training configuration (kept small for quick notebook runs).
NUM_EPOCHS = 5
NUM_SONGS = 2
BATCH_SIZE = 100  # timesteps per training window (see gen_batch)
OVERLAP = 50      # timesteps intended to be shared between consecutive windows
specialist = Specialist(INPUT_SIZE, HIDDEN_SIZE, NUM_HIDDEN_LAYERS)
generalist = Generalist(INPUT_SIZE, HIDDEN_SIZE, NUM_HIDDEN_LAYERS)
def train(model):
    """Train ``model`` on the first NUM_SONGS songs of ``dataset`` for
    NUM_EPOCHS epochs.

    Returns a list with the accumulated loss of each epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    start = time.time()
    losses = []
    # BUG FIX: range(NUM_EPOCHS + 1) ran epochs 0..NUM_EPOCHS, i.e. one epoch
    # too many; range(1, NUM_EPOCHS + 1) runs exactly NUM_EPOCHS epochs and
    # keeps the "Epoch x/NUM_EPOCHS" progress display consistent.
    for epoch in range(1, NUM_EPOCHS + 1):
        cur_song = 0
        loss_per_epoch = 0
        for inputs, tags, targets in dataset:
            cur_song += 1
            # Iterate (rather than index) because indexing the dataset returns
            # the wrong datatype; songs past NUM_SONGS are skipped.
            if cur_song > NUM_SONGS:
                continue
            batch_loss = 0
            for input_batch_seq, target_batch_seq in gen_batch(inputs, targets, BATCH_SIZE, OVERLAP):
                model.zero_grad()  # PyTorch accumulates gradients by default
                # Re-initialise the hidden state so it is detached from the
                # previous batch's computation graph.
                model.hidden = model.init_hidden(tags)
                # Forward pass
                output_batch_seq, _ = model(input_batch_seq)
                # Loss, gradients, parameter update
                loss = loss_func(output_batch_seq, target_batch_seq)
                loss.backward()
                optimizer.step()
                batch_loss += loss.cpu().detach().numpy()
            print("Epoch {}/{}, processing song {}/{}. Batch Loss: {}".format(epoch, NUM_EPOCHS, cur_song, NUM_SONGS, batch_loss), end='\r')
            loss_per_epoch += batch_loss
        losses.append(loss_per_epoch)
    print("Done!")
    end = time.time()
    # Message fix: the original summary read "Trained for 5 on 2", dropping
    # the units entirely.
    print(f"Trained for {NUM_EPOCHS} epochs on {NUM_SONGS} songs - time used: {end-start} seconds")
    return losses
def threshold(tensor, threshold=0.5):
    """Binarize ``tensor``: True wherever a value strictly exceeds ``threshold``."""
    binarized = tensor > threshold
    return binarized
def avg_keypress(array, min_threshold):
    """Return the mean number of entries above ``min_threshold`` per timestep.

    ``array`` is indexed as (timesteps, ...); the count of values strictly
    greater than ``min_threshold`` is divided by the number of timesteps.
    """
    presses = np.sum(array > min_threshold)
    return presses / array.shape[0]
def find_threshold(song_array, minval=-100, maxval=100, keypresses_per_timestep=2.5, max_iter=15):
    """Binary-search a threshold whose resulting average number of keypresses
    per timestep is as close as possible to ``keypresses_per_timestep``.

    Runs a fixed number of bisection steps over [minval, maxval] and returns
    the midpoint of the final interval.
    """
    for _ in range(max_iter):
        mid = (minval + maxval) / 2
        if avg_keypress(song_array, mid) > keypresses_per_timestep:
            # Too many keypresses => raise the threshold (keep upper half).
            minval = mid
        else:
            # Too few keypresses => lower the threshold (keep lower half).
            maxval = mid
    return mid
# Train both models and keep the per-epoch losses for the comparison plot below.
print("Training generalist...")
gen_loss = train(generalist)
print("Training specialist...")
spec_loss = train(specialist)
Training generalist... Done! 5/5, processing song 2/2. Batch Loss: 1.4565308801829815 Trained for 5 on 2 - time used: 13.074855327606201 seconds Training specialist... Done! 5/5, processing song 2/2. Batch Loss: 1.3102565966546535 Trained for 5 on 2 - time used: 13.966777086257935 seconds
# Plot the total per-epoch training loss of both models for comparison.
%matplotlib inline
import matplotlib.pyplot as plt
plt.title('Total loss per epoch')
plt.plot(range(len(spec_loss)), spec_loss, '-', range(len(gen_loss)), gen_loss, '--')
plt.legend(['Specialist', 'Generalist'])
<matplotlib.legend.Legend at 0x7f19ba0c7c18>
In order for the model to generate a new song, we pass in the sequence one timestep at a time, and get the hidden state out. The output and the hidden state will be fed into the model at the next timestep.
def generate_song(model, song, composer, num_timesteps=10):
    """Generate a new song of the same length as ``song``.

    The first ``num_timesteps`` steps of ``song`` seed the model; after that
    the model's own (binarized) output is fed back as the next input.
    Returns the raw model outputs (logits) as a numpy array.
    """
    output_length = song.shape[0]
    output_threshold = 0  # logits > 0 correspond to sigmoid probability > 0.5
    # Seed sequence. NOTE(review): starts at index 1, presumably so each seed
    # input lines up with the timestep being predicted -- confirm intent.
    song_timesteps = song[1:num_timesteps + 1, :]
    new_song = np.zeros_like(song)
    hidden = model.init_hidden(composer)
    output = None
    for timestep in range(output_length):
        if timestep < num_timesteps:
            inputs = song_timesteps[timestep, :].unsqueeze(0)
        else:
            inputs = output.detach()
        # Binarize so the model always sees 0/1 keypresses as input.
        inputs = inputs.ge(output_threshold).float()
        # BUG FIX: ``hidden`` was previously passed positionally, landing in
        # the ``tag`` parameter of forward(inputs, tag=None, hidden=None), so
        # the state produced by init_hidden() was silently ignored and the
        # model reused whatever self.hidden was left over from training.
        output, hidden = model(inputs, hidden=hidden)
        new_song[timestep, :] = output.detach().cpu().numpy()
    return new_song.squeeze()
"""
Let's generate different styles of music based on the same song with the given model.
"""
def generate_composer_styles(model, song, num_timesteps=10, num_styles=4):
    """Generate ``num_styles`` versions of ``song``, one per composer tag.

    Each generated song is thresholded to roughly 2.5 keypresses per
    timestep and visualized as a piano roll. Returns the list of
    binarized songs.
    """
    specialist_songs = []
    for i in range(num_styles):
        print("Generating song {} of {}".format(i+1, num_styles), end='\r')
        # BUG FIX: torch.LongTensor(0) constructs an EMPTY tensor and ignored
        # the loop index, so every iteration used the same meaningless
        # composer tag. Pass the composer index so each style can differ.
        # NOTE(review): Specialist.init_hidden embeds this tag to shape
        # (1, hidden_size) while nn.LSTM expects (num_layers, 1, hidden_size)
        # -- confirm tag/hidden shapes against the training pipeline.
        song_gen = generate_song(model, song, composer=torch.LongTensor([i]), num_timesteps=num_timesteps)
        thresh = find_threshold(song_gen, keypresses_per_timestep=2.5)
        song_gen = threshold(song_gen, thresh)
        visualize_piano_roll(song_gen.T)
        specialist_songs.append(song_gen)
    return specialist_songs
We can load pretrained models in order to skip training. All the song generation methods used above can be called to generate new music with the loaded model.
!ls models/
generalist_1000_100_50_FINAL_EXPORT.pt specialist_100_32_16_FINAL.pt generalist_100_20_10.pt specialist_20_20_10_.pt generalist_100_32_16_FINAL.pt specialist_20_20_10_sparse.pt specialist_1000_100_50_FINAL_EXPORT.pt
# Load pretrained weights so the training step above can be skipped.
# NOTE(review): torch.load unpickles arbitrary objects -- only load model
# files from a trusted source.
specialist = Specialist(INPUT_SIZE, HIDDEN_SIZE, NUM_HIDDEN_LAYERS)
specialist.load_state_dict(torch.load('models/specialist_1000_100_50_FINAL_EXPORT.pt', map_location='cpu'))
generalist = Generalist(INPUT_SIZE, HIDDEN_SIZE, NUM_HIDDEN_LAYERS)
generalist.load_state_dict(torch.load('models/generalist_1000_100_50_FINAL_EXPORT.pt', map_location='cpu'))
# Generate songs in the style of specialist
# The first song of the dataset is used as the seed sequence.
chosen_song = dataset[0][0]
specialist_songs = generate_composer_styles(specialist, chosen_song, num_timesteps=30, num_styles=4)
Generating song 1 of 3
Generating song 2 of 3
Generating song 3 of 3
Generating song 4 of 3
# Generate songs in the style of generalist
# The second song of the dataset is used as the seed sequence.
chosen_song = dataset[1][0]
generalist_songs = generate_composer_styles(generalist, chosen_song, num_timesteps=30, num_styles=4)
Generating song 1 of 3
Generating song 2 of 3
Generating song 3 of 3
Generating song 4 of 3
# Play specialist song
# Composers: Bach, Brahms, Debussy, Mozart
spec_song = specialist_songs[0]
# .T presumably puts the roll into (pitches, timesteps) order for the playback
# helper; fs=5 looks like the piano-roll sampling rate -- TODO confirm against
# embed_play_v1's signature.
embed_play_v1(spec_song.T, fs=5)